{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "be2ef5d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "### Figuring out Daria's files\n",
    "\n",
    "import os\n",
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "4c9c73ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Folder with the CSV files received from Daria (July 2025 drop, going by the path).\n",
    "# NOTE(review): absolute, machine-specific path -- edit before running elsewhere.\n",
    "DIR = \"/Users/dromar/Documents/MyDrive/Research/ACL/data/Final_Data_Files/FromDaria/July2025\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e6412ac4",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/zl/x_m9bbzs7f5bwbgjmfs354y9ygn_j5/T/ipykernel_20051/4074684352.py:1: DtypeWarning: Columns (2) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  file1 = pd.read_csv(os.path.join(DIR, \"authors_cv_fos_final_cleaned.csv\"))\n"
     ]
    },
    {
     "data": {
      "application/vnd.microsoft.datawrangler.viewer.v0+json": {
       "columns": [
        {
         "name": "index",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "author_id",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "year",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "fieldsOfStudy",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "fieldsUntilYear_cleaned",
         "rawType": "object",
         "type": "string"
        }
       ],
       "ref": "7da27e23-7da7-4033-a8d8-844174c561f8",
       "rows": [
        [
         "753167",
         "9823812",
         "2024",
         "['Computer Science, Mathematics']",
         "Computer Science, Mathematics"
        ],
        [
         "753168",
         "2276518672",
         "2024",
         "['Computer Science, Mathematics']",
         "Computer Science, Mathematics"
        ],
        [
         "753169",
         "2276518213",
         "2024",
         "['Computer Science, Mathematics']",
         "Computer Science, Mathematics"
        ],
        [
         "753170",
         "2286443286",
         "2024",
         "['Computer Science, Mathematics']",
         "Computer Science, Mathematics"
        ],
        [
         "753171",
         "2322724001",
         "2024",
         "['Computer Science']",
         "Computer Science"
        ],
        [
         "753172",
         "2300249072",
         "2024",
         "['Computer Science']",
         "Computer Science"
        ],
        [
         "753173",
         "2300249218",
         "2024",
         "['Computer Science']",
         "Computer Science"
        ]
       ],
       "shape": {
        "columns": 4,
        "rows": 7
       }
      },
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>author_id</th>\n",
       "      <th>year</th>\n",
       "      <th>fieldsOfStudy</th>\n",
       "      <th>fieldsUntilYear_cleaned</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>753167</th>\n",
       "      <td>9823812</td>\n",
       "      <td>2024</td>\n",
       "      <td>['Computer Science, Mathematics']</td>\n",
       "      <td>Computer Science, Mathematics</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>753168</th>\n",
       "      <td>2276518672</td>\n",
       "      <td>2024</td>\n",
       "      <td>['Computer Science, Mathematics']</td>\n",
       "      <td>Computer Science, Mathematics</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>753169</th>\n",
       "      <td>2276518213</td>\n",
       "      <td>2024</td>\n",
       "      <td>['Computer Science, Mathematics']</td>\n",
       "      <td>Computer Science, Mathematics</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>753170</th>\n",
       "      <td>2286443286</td>\n",
       "      <td>2024</td>\n",
       "      <td>['Computer Science, Mathematics']</td>\n",
       "      <td>Computer Science, Mathematics</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>753171</th>\n",
       "      <td>2322724001</td>\n",
       "      <td>2024</td>\n",
       "      <td>['Computer Science']</td>\n",
       "      <td>Computer Science</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>753172</th>\n",
       "      <td>2300249072</td>\n",
       "      <td>2024</td>\n",
       "      <td>['Computer Science']</td>\n",
       "      <td>Computer Science</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>753173</th>\n",
       "      <td>2300249218</td>\n",
       "      <td>2024</td>\n",
       "      <td>['Computer Science']</td>\n",
       "      <td>Computer Science</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         author_id  year                      fieldsOfStudy  \\\n",
       "753167     9823812  2024  ['Computer Science, Mathematics']   \n",
       "753168  2276518672  2024  ['Computer Science, Mathematics']   \n",
       "753169  2276518213  2024  ['Computer Science, Mathematics']   \n",
       "753170  2286443286  2024  ['Computer Science, Mathematics']   \n",
       "753171  2322724001  2024               ['Computer Science']   \n",
       "753172  2300249072  2024               ['Computer Science']   \n",
       "753173  2300249218  2024               ['Computer Science']   \n",
       "\n",
       "              fieldsUntilYear_cleaned  \n",
       "753167  Computer Science, Mathematics  \n",
       "753168  Computer Science, Mathematics  \n",
       "753169  Computer Science, Mathematics  \n",
       "753170  Computer Science, Mathematics  \n",
       "753171               Computer Science  \n",
       "753172               Computer Science  \n",
       "753173               Computer Science  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Per-author, per-year fields of study\n",
    "# (columns: author_id, year, fieldsOfStudy, fieldsUntilYear_cleaned).\n",
    "# low_memory=False parses the file in one pass so every column gets a single,\n",
    "# consistent dtype; the default chunked parsing raised a DtypeWarning\n",
    "# (\"Columns (2) have mixed types\") on this file.\n",
    "file1 = pd.read_csv(\n",
    "    os.path.join(DIR, \"authors_cv_fos_final_cleaned.csv\"),\n",
    "    low_memory=False,\n",
    ")\n",
    "file1.tail(7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "6f217deb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# collect_fields() reads a 'fieldOfStudy' column, so expose the cleaned\n",
    "# per-author fields under that name (sets the column on file1 in place).\n",
    "file1['fieldOfStudy'] = file1['fieldsUntilYear_cleaned']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "0dd8c332",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.microsoft.datawrangler.viewer.v0+json": {
       "columns": [
        {
         "name": "index",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "paper_id",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "author_ids",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "fieldsUntilYear",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "authors_final_FOS",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "subdomain",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "year",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "main_FOS",
         "rawType": "object",
         "type": "unknown"
        },
        {
         "name": "aggregated_fields",
         "rawType": "object",
         "type": "string"
        }
       ],
       "ref": "ea258f7d-acfd-49ab-896c-281b65a377ab",
       "rows": [
        [
         "0",
         "698b9956f28df9cb2484ead9b4a111ab6b01e5a4",
         "2020979, 1984556, 2111046427, 31481989, 2628881",
         "Computer Science, Computer Science, Computer Science, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer 
Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science",
         "Computer Science",
         "CV&AI",
         "2014",
         "Computer Science, Engineering",
         "Computer Science"
        ],
        [
         "1",
         "11608f935502346a28907a5a092ace4a18caa2df",
         "3189711",
         "Computer Science",
         "Computer Science",
         "CV&AI",
         "2014",
         null,
         "Computer Science"
        ],
        [
         "2",
         "409f039019e058b2c9b041e871850ff37ce98861",
         "2821832, 3246183, 2139481, 144000830",
         "Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science",
         "Computer Science",
         "CV&AI",
         "2014",
         "Computer Science",
         "Computer Science"
        ],
        [
         "3",
         "138a5500b025fdbbdfa6ed272b1aaace1f5acd1d",
         "2961231, 1747909",
         "Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Materials Science, Computer Science, Engineering, Computer Science, Engineering, Mathematics, Engineering, Engineering, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Materials Science, Computer Science, Engineering, Computer Science, Engineering, Mathematics, Engineering, Engineering, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Materials Science, Computer Science, Engineering, Computer Science, Engineering, Mathematics, Engineering, Engineering, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Materials Science, Computer Science, Engineering, Computer Science, Engineering, Mathematics, Engineering, Engineering, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Materials Science, Computer Science, Engineering, Computer Science, Engineering, Mathematics, Engineering, Engineering, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Materials Science, Computer Science, Engineering, Computer Science, Engineering, Mathematics, Engineering, Engineering, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Materials Science, Computer Science, Engineering, Computer Science, Engineering, Mathematics, Engineering, Engineering, Engineering, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Materials Science, Computer Science, Engineering, Computer Science, Engineering, 
Mathematics, Engineering, Engineering, Engineering, Computer Science, Computer Science",
         "Computer Science, Medicine, Mathematics, Art",
         "CV&AI",
         "2014",
         "Computer Science, Engineering",
         "Computer Science, Medicine, Mathematics, Art"
        ],
        [
         "4",
         "1697b53565696f3e542433a59e15a75b5ca64a92",
         "1780381, 2254178",
         "Computer Science",
         "Computer Science",
         "CV&AI",
         "2014",
         null,
         "Computer Science"
        ]
       ],
       "shape": {
        "columns": 8,
        "rows": 5
       }
      },
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>paper_id</th>\n",
       "      <th>author_ids</th>\n",
       "      <th>fieldsUntilYear</th>\n",
       "      <th>authors_final_FOS</th>\n",
       "      <th>subdomain</th>\n",
       "      <th>year</th>\n",
       "      <th>main_FOS</th>\n",
       "      <th>aggregated_fields</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>698b9956f28df9cb2484ead9b4a111ab6b01e5a4</td>\n",
       "      <td>2020979, 1984556, 2111046427, 31481989, 2628881</td>\n",
       "      <td>Computer Science, Computer Science, Computer S...</td>\n",
       "      <td>Computer Science</td>\n",
       "      <td>CV&amp;AI</td>\n",
       "      <td>2014</td>\n",
       "      <td>Computer Science, Engineering</td>\n",
       "      <td>Computer Science</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>11608f935502346a28907a5a092ace4a18caa2df</td>\n",
       "      <td>3189711</td>\n",
       "      <td>Computer Science</td>\n",
       "      <td>Computer Science</td>\n",
       "      <td>CV&amp;AI</td>\n",
       "      <td>2014</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Computer Science</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>409f039019e058b2c9b041e871850ff37ce98861</td>\n",
       "      <td>2821832, 3246183, 2139481, 144000830</td>\n",
       "      <td>Computer Science, Computer Science, Computer S...</td>\n",
       "      <td>Computer Science</td>\n",
       "      <td>CV&amp;AI</td>\n",
       "      <td>2014</td>\n",
       "      <td>Computer Science</td>\n",
       "      <td>Computer Science</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>138a5500b025fdbbdfa6ed272b1aaace1f5acd1d</td>\n",
       "      <td>2961231, 1747909</td>\n",
       "      <td>Computer Science, Computer Science, Computer S...</td>\n",
       "      <td>Computer Science, Medicine, Mathematics, Art</td>\n",
       "      <td>CV&amp;AI</td>\n",
       "      <td>2014</td>\n",
       "      <td>Computer Science, Engineering</td>\n",
       "      <td>Computer Science, Medicine, Mathematics, Art</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1697b53565696f3e542433a59e15a75b5ca64a92</td>\n",
       "      <td>1780381, 2254178</td>\n",
       "      <td>Computer Science</td>\n",
       "      <td>Computer Science</td>\n",
       "      <td>CV&amp;AI</td>\n",
       "      <td>2014</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Computer Science</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                   paper_id  \\\n",
       "0  698b9956f28df9cb2484ead9b4a111ab6b01e5a4   \n",
       "1  11608f935502346a28907a5a092ace4a18caa2df   \n",
       "2  409f039019e058b2c9b041e871850ff37ce98861   \n",
       "3  138a5500b025fdbbdfa6ed272b1aaace1f5acd1d   \n",
       "4  1697b53565696f3e542433a59e15a75b5ca64a92   \n",
       "\n",
       "                                        author_ids  \\\n",
       "0  2020979, 1984556, 2111046427, 31481989, 2628881   \n",
       "1                                          3189711   \n",
       "2             2821832, 3246183, 2139481, 144000830   \n",
       "3                                 2961231, 1747909   \n",
       "4                                 1780381, 2254178   \n",
       "\n",
       "                                     fieldsUntilYear  \\\n",
       "0  Computer Science, Computer Science, Computer S...   \n",
       "1                                   Computer Science   \n",
       "2  Computer Science, Computer Science, Computer S...   \n",
       "3  Computer Science, Computer Science, Computer S...   \n",
       "4                                   Computer Science   \n",
       "\n",
       "                              authors_final_FOS subdomain  year  \\\n",
       "0                              Computer Science     CV&AI  2014   \n",
       "1                              Computer Science     CV&AI  2014   \n",
       "2                              Computer Science     CV&AI  2014   \n",
       "3  Computer Science, Medicine, Mathematics, Art     CV&AI  2014   \n",
       "4                              Computer Science     CV&AI  2014   \n",
       "\n",
       "                        main_FOS                             aggregated_fields  \n",
       "0  Computer Science, Engineering                              Computer Science  \n",
       "1                            NaN                              Computer Science  \n",
       "2               Computer Science                              Computer Science  \n",
       "3  Computer Science, Engineering  Computer Science, Medicine, Mathematics, Art  \n",
       "4                            NaN                              Computer Science  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Paper-level table from the CVAI file (paper_id, author_ids, year and the\n",
    "# existing field-of-study columns shown in the preview below).\n",
    "file2 = pd.read_csv(os.path.join(DIR, \"CVAI_final_version_FOS_July25.csv\"))\n",
    "file2.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "a2ae055c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Work on copies so the frames loaded above stay untouched:\n",
    "# cv_ai_df holds the author/year rows (passed to collect_fields as the lookup\n",
    "# table), df holds the paper rows.\n",
    "cv_ai_df = file1.copy()\n",
    "df = file2.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "a3ed3b03",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Second copy of the paper table; the cells below overwrite its\n",
    "# 'fieldsUntilYear' and 'main_FOS' columns with recomputed values.\n",
    "merged_df = df.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "a858518c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|          | 0/73424 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 73424/73424 [02:40<00:00, 457.79it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                   paper_id  \\\n",
      "0  698b9956f28df9cb2484ead9b4a111ab6b01e5a4   \n",
      "1  11608f935502346a28907a5a092ace4a18caa2df   \n",
      "2  409f039019e058b2c9b041e871850ff37ce98861   \n",
      "3  138a5500b025fdbbdfa6ed272b1aaace1f5acd1d   \n",
      "4  1697b53565696f3e542433a59e15a75b5ca64a92   \n",
      "\n",
      "                                     fieldsUntilYear  \\\n",
      "0  Computer Science, Materials Science, History, ...   \n",
      "1                                   Computer Science   \n",
      "2  Materials Science, History, Mathematics, Art, ...   \n",
      "3  Computer Science, Medicine, Mathematics, Art, ...   \n",
      "4                 Computer Science, Computer Science   \n",
      "\n",
      "                              main_FOS  \n",
      "0  Computer Science, Materials Science  \n",
      "1                     Computer Science  \n",
      "2  Computer Science, Materials Science  \n",
      "3           Computer Science, Medicine  \n",
      "4                     Computer Science  \n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from collections import Counter\n",
    "from tqdm import tqdm\n",
    "\n",
    "# --- Register tqdm for pandas apply ---\n",
    "tqdm.pandas()\n",
    "# --- Define helper function to determine dominant or top fields ---\n",
    "\n",
    "def get_top_fields(fields, threshold=0.9, top_n=2):\n",
    "    \"\"\"\n",
    "    Pick the main field(s) of study from a comma-separated string of fields.\n",
    "\n",
    "    Every comma-separated entry counts as one occurrence, so repeats matter.\n",
    "    If the most frequent field makes up at least `threshold` of all entries\n",
    "    it is returned alone; otherwise the `top_n` most frequent fields are\n",
    "    returned, joined by ', '.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    fields : str or missing value\n",
    "        Comma-separated field names, e.g. 'Computer Science, Mathematics'.\n",
    "        None, NaN and empty strings are treated as \"no fields\".\n",
    "    threshold : float, default 0.9\n",
    "        Minimum share of entries the top field needs to be returned alone.\n",
    "    top_n : int, default 2\n",
    "        How many fields to return when no single field reaches `threshold`.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str\n",
    "        The dominant field, or up to `top_n` fields joined by ', ', or ''\n",
    "        when there is nothing to count.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If `top_n` is smaller than 1.\n",
    "\n",
    "    Notes\n",
    "    -----\n",
    "    Fields with equal counts keep their order of first appearance, so the\n",
    "    result can depend on the order of the entries in `fields`.\n",
    "    \"\"\"\n",
    "    if top_n < 1:\n",
    "        raise ValueError(\"top_n must be at least 1\")\n",
    "\n",
    "    # Missing (None / NaN) or empty input carries no field information\n",
    "    if pd.isna(fields) or not fields:\n",
    "        return ''\n",
    "\n",
    "    # Split on commas, trim whitespace and drop blank entries\n",
    "    field_list = [part.strip() for part in fields.split(',') if part.strip()]\n",
    "    if not field_list:\n",
    "        return ''\n",
    "\n",
    "    # Rank fields by frequency (ties stay in first-appearance order)\n",
    "    ranked = Counter(field_list).most_common()\n",
    "    _, top_count = ranked[0]\n",
    "\n",
    "    # A single field is dominant when its share reaches the threshold\n",
    "    if top_count / len(field_list) >= threshold:\n",
    "        return ranked[0][0]\n",
    "\n",
    "    # Otherwise report the top_n most frequent fields\n",
    "    return ', '.join(name for name, _ in ranked[:top_n])\n",
    "\n",
    "# --- Define function to collect fields from authors for each paper ---\n",
    "\n",
    "def collect_fields(author_ids_str, paper_year, author_df):\n",
    "    \"\"\"\n",
    "    Gather the fields of study of a paper's authors for the paper's year.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    author_ids_str : str\n",
    "        Comma-separated author IDs of the paper. Blank entries and the\n",
    "        literal 'None' are ignored; a missing value yields ''.\n",
    "    paper_year : int or float\n",
    "        Year of the paper; only author rows of that year are used.\n",
    "    author_df : pd.DataFrame\n",
    "        Table with 'author_id', 'year' and 'fieldOfStudy' columns.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str\n",
    "        The matching 'fieldOfStudy' values joined by ', ', in the order the\n",
    "        authors are listed (first matching row per author). Authors without\n",
    "        a row for that year, or whose field is not a string (e.g. NaN), are\n",
    "        skipped. Returns '' when there are no author IDs or when any ID is\n",
    "        not an integer.\n",
    "    \"\"\"\n",
    "    # Missing author list -> nothing to collect\n",
    "    if pd.isna(author_ids_str):\n",
    "        return \"\"\n",
    "\n",
    "    # str() lets a lone numeric ID (read as a number instead of text) through\n",
    "    tokens = [tok.strip() for tok in str(author_ids_str).split(\",\")]\n",
    "\n",
    "    # Parse the IDs, ignoring blanks and the literal 'None'\n",
    "    try:\n",
    "        author_ids = [int(tok) for tok in tokens if tok and tok != \"None\"]\n",
    "    except ValueError:\n",
    "        # One malformed ID invalidates the whole list (unchanged behaviour)\n",
    "        return \"\"\n",
    "    if not author_ids:\n",
    "        return \"\"\n",
    "\n",
    "    # Filter author_df once per paper instead of scanning it once per author\n",
    "    wanted = (author_df['year'] == paper_year) & author_df['author_id'].isin(author_ids)\n",
    "    matches = author_df.loc[wanted, ['author_id', 'fieldOfStudy']]\n",
    "\n",
    "    # For each author the first matching row wins\n",
    "    first_field = {}\n",
    "    for aid, field in zip(matches['author_id'], matches['fieldOfStudy']):\n",
    "        first_field.setdefault(aid, field)\n",
    "\n",
    "    # Keep the paper's author order; skip non-string values such as NaN,\n",
    "    # which str.join cannot handle\n",
    "    fields = [\n",
    "        first_field[aid] for aid in author_ids\n",
    "        if isinstance(first_field.get(aid), str)\n",
    "    ]\n",
    "\n",
    "    # Join all collected fields into one string\n",
    "    return \", \".join(fields)\n",
    "\n",
    "# --- Rebuild 'fieldsUntilYear': the fields of every author for the paper's year ---\n",
    "\n",
    "# Row-wise pass over the papers, with a tqdm progress bar\n",
    "merged_df['fieldsUntilYear'] = merged_df.progress_apply(\n",
    "    lambda paper: collect_fields(paper['author_ids'], paper['year'], cv_ai_df),\n",
    "    axis=1,\n",
    ")\n",
    "\n",
    "# --- Derive 'main_FOS' from the collected fields ---\n",
    "\n",
    "# threshold is forwarded to get_top_fields; change it here (e.g., 0.7) to experiment\n",
    "merged_df['main_FOS'] = merged_df['fieldsUntilYear'].apply(get_top_fields, threshold=0.9)\n",
    "\n",
    "# --- Quick look at the result ---\n",
    "\n",
    "review_columns = ['paper_id', 'fieldsUntilYear', 'main_FOS']\n",
    "print(merged_df[review_columns].head())\n",
    "\n",
    "# Optional: save to CSV\n",
    "# merged_df.to_csv(\"papers_with_main_FOS.csv\", index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "1d243ca9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(99501, 28)\n"
     ]
    }
   ],
   "source": [
    "#### load the consolidated file\n",
    "# NOTE(review): absolute, machine-specific path -- edit before running elsewhere.\n",
    "alt_DIR = \"/Users/dromar/Documents/MyDrive/Research/ACL/Replication_SocietalAI\"\n",
    "DATA_DIR = os.path.join(alt_DIR, \"Data\")\n",
    "# Consolidated table; its 'authors_final_FOS' column is refreshed from merged_df further down.\n",
    "df2 = pd.read_csv(os.path.join(DATA_DIR, \"processed_June2025_final_data.csv\"))\n",
    "print(df2.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "bbe75350",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "df2 sample:\n",
      "                                   paper_id              authors_final_FOS\n",
      "0  0000fcfd467a19cf0e59169c2f07d730a0f3a8b9               Computer Science\n",
      "1  000178cd12c8a6e5da8215b6365fae03c20fd18d               Computer Science\n",
      "2  0002281d45e90ebd3add330f5960973a24d68e1a               Computer Science\n",
      "3  0002378302c911af9a3cad99955d6c8cf2c6ddf3               Computer Science\n",
      "4  00035282b3fc11736967f03b6750b2ffc5fef42f  Computer Science, Mathematics\n",
      "5  0003652f523f5972033aeebdd287af018c9a4833               Computer Science\n",
      "6  000429c966f527613c700947a0912b10ce3f359d               Computer Science\n",
      "7  0005ba64c6bbdc29fc88a4d13e40b4827afc2168  Computer Science, Mathematics\n",
      "\n",
      "merged_df sample:\n",
      "                                   paper_id              authors_final_FOS\n",
      "0  0000fcfd467a19cf0e59169c2f07d730a0f3a8b9               Computer Science\n",
      "1  000178cd12c8a6e5da8215b6365fae03c20fd18d               Computer Science\n",
      "2  0002281d45e90ebd3add330f5960973a24d68e1a               Computer Science\n",
      "3  0002378302c911af9a3cad99955d6c8cf2c6ddf3               Computer Science\n",
      "4  00035282b3fc11736967f03b6750b2ffc5fef42f  Computer Science, Mathematics\n",
      "5  0003652f523f5972033aeebdd287af018c9a4833               Computer Science\n",
      "6  000429c966f527613c700947a0912b10ce3f359d               Computer Science\n",
      "7  0005ba64c6bbdc29fc88a4d13e40b4827afc2168  Computer Science, Mathematics\n"
     ]
    }
   ],
   "source": [
    "# paper_ids present in merged_df\n",
    "merged_ids = set(merged_df[\"paper_id\"])\n",
    "\n",
    "# Keep only the df2 rows for those papers ...\n",
    "df2_matched = df2.loc[df2[\"paper_id\"].isin(merged_ids)]\n",
    "\n",
    "# ... and the merged_df rows for the papers that survived in df2\n",
    "merged_df_matched = merged_df.loc[merged_df[\"paper_id\"].isin(df2_matched[\"paper_id\"])]\n",
    "\n",
    "# Order both tables by paper_id so the two previews line up visually\n",
    "df2_matched_sorted = df2_matched.sort_values(by=\"paper_id\").reset_index(drop=True)\n",
    "merged_df_matched_sorted = merged_df_matched.sort_values(by=\"paper_id\").reset_index(drop=True)\n",
    "\n",
    "# Show the first eight rows of each table, one after the other\n",
    "print(\"df2 sample:\", df2_matched_sorted.head(8)[[\"paper_id\", \"authors_final_FOS\"]], sep=\"\\n\")\n",
    "print(\"\\nmerged_df sample:\", merged_df_matched_sorted.head(8)[[\"paper_id\", \"authors_final_FOS\"]], sep=\"\\n\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "58cdadbb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# paper_id -> authors_final_FOS as currently stored in merged_df\n",
    "# (for a repeated paper_id the last row wins).\n",
    "# NOTE(review): this takes merged_df's existing 'authors_final_FOS' column, not the\n",
    "# recomputed 'main_FOS' -- confirm that this is the intended source.\n",
    "fos_mapping = dict(zip(merged_df[\"paper_id\"], merged_df[\"authors_final_FOS\"]))\n",
    "\n",
    "# Prefer the mapped value; papers without one keep df2's current value\n",
    "df2[\"authors_final_FOS\"] = (\n",
    "    df2[\"paper_id\"]\n",
    "    .map(fos_mapping)\n",
    "    .combine_first(df2[\"authors_final_FOS\"])\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "228cefc5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Written under a new file name, so the June 2025 input file read above is left as is.\n",
    "df2.to_csv(os.path.join(DATA_DIR, \"processed_June2025_final_data_updated.csv\"), index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "7d16ee79",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 73424/73424 [02:07<00:00, 576.30it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                   paper_id  \\\n",
      "0  698b9956f28df9cb2484ead9b4a111ab6b01e5a4   \n",
      "1  11608f935502346a28907a5a092ace4a18caa2df   \n",
      "2  409f039019e058b2c9b041e871850ff37ce98861   \n",
      "3  138a5500b025fdbbdfa6ed272b1aaace1f5acd1d   \n",
      "4  1697b53565696f3e542433a59e15a75b5ca64a92   \n",
      "\n",
      "                                     fieldsUntilYear  \\\n",
      "0  Computer Science, Materials Science, History, ...   \n",
      "1                                   Computer Science   \n",
      "2  Materials Science, History, Mathematics, Art, ...   \n",
      "3  Computer Science, Medicine, Mathematics, Art, ...   \n",
      "4                 Computer Science, Computer Science   \n",
      "\n",
      "                              main_FOS                         threshold_75  \\\n",
      "0  Computer Science, Materials Science  Computer Science, Materials Science   \n",
      "1                     Computer Science                     Computer Science   \n",
      "2  Computer Science, Materials Science  Computer Science, Materials Science   \n",
      "3           Computer Science, Medicine           Computer Science, Medicine   \n",
      "4                     Computer Science                     Computer Science   \n",
      "\n",
      "                          threshold_80  \n",
      "0  Computer Science, Materials Science  \n",
      "1                     Computer Science  \n",
      "2  Computer Science, Materials Science  \n",
      "3           Computer Science, Medicine  \n",
      "4                     Computer Science  \n"
     ]
    }
   ],
   "source": [
    "##### Robustness check: recompute the top fields with thresholds 0.75 and 0.80\n",
    "\n",
    "# Output column name -> threshold handed to get_top_fields.\n",
    "# Add an entry here to test another threshold.\n",
    "robustness_thresholds = {'threshold_75': 0.75, 'threshold_80': 0.8}\n",
    "\n",
    "# Collect the fields for every row of merged_df from its author ids and year.\n",
    "# NOTE: this overwrites any existing 'fieldsUntilYear' column and is a slow row-wise pass\n",
    "# (roughly two minutes for ~73k rows in the recorded run).\n",
    "merged_df['fieldsUntilYear'] = merged_df.progress_apply(\n",
    "    lambda row: collect_fields(row['author_ids'], row['year'], cv_ai_df),\n",
    "    axis=1\n",
    ")\n",
    "\n",
    "# --- Create one column per robustness threshold ---\n",
    "\n",
    "# Apply get_top_fields to each fieldsUntilYear string, once per threshold.\n",
    "# The lambda is consumed immediately by .apply, so it sees the current loop value.\n",
    "for column_name, threshold in robustness_thresholds.items():\n",
    "    merged_df[column_name] = merged_df['fieldsUntilYear'].apply(\n",
    "        lambda fields: get_top_fields(fields, threshold=threshold)\n",
    "    )\n",
    "\n",
    "# --- Check and print results ---\n",
    "\n",
    "# Show the baseline main_FOS next to the robustness columns for review\n",
    "review_columns = ['paper_id', 'fieldsUntilYear', 'main_FOS'] + list(robustness_thresholds)\n",
    "print(merged_df[review_columns].head())\n",
    "\n",
    "# Optional: save to CSV\n",
    "# merged_df.to_csv(\"papers_with_main_FOS.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "dc3fa540",
   "metadata": {},
   "outputs": [],
   "source": [
    "### Rows where the baseline main_FOS disagrees with a robustness threshold column.\n",
    "# The comparison is element-wise inequality on the stored values, so two field lists\n",
    "# written in a different order count as different.\n",
    "main_fos = merged_df['main_FOS']\n",
    "different_from_75 = merged_df.loc[main_fos.ne(merged_df['threshold_75'])]\n",
    "different_from_80 = merged_df.loc[main_fos.ne(merged_df['threshold_80'])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "0f081532",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.microsoft.datawrangler.viewer.v0+json": {
       "columns": [
        {
         "name": "index",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "paper_id",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "author_ids",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "fieldsUntilYear",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "authors_final_FOS",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "subdomain",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "year",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "main_FOS",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "aggregated_fields",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "threshold_75",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "threshold_80",
         "rawType": "object",
         "type": "string"
        }
       ],
       "ref": "66d469c9-925b-42f9-8949-8b92a41adbf4",
       "rows": [
        [
         "198",
         "1eb3feb77ea540f2551920843258b8de70c9b9dd",
         "2161037, 36547165, 1780381",
         "Computer Science, Computer Science, Mathematics, Computer Science",
         "Computer Science, Medicine, Mathematics",
         "CV&AI",
         "2014",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science, Mathematics"
        ],
        [
         "310",
         "198f325aaadd3bfb4877b1fbbe7416bde1eb4b62",
         "1786819, 1877377, 2133884, 2734293, 1992077",
         "Computer Science, Mathematics, Computer Science, Computer Science, Computer Science, Computer Science",
         "Computer Science",
         "CV&AI",
         "2014",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science"
        ],
        [
         "669",
         "8e07ece8543f9431f03b66c0b296cedb1435e667",
         "1745524, 2133680, 1753210",
         "Computer Science, Computer Science, Mathematics, Computer Science",
         "Computer Science",
         "CV&AI",
         "2014",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science, Mathematics"
        ],
        [
         "715",
         "1cb2e7daf2a1577aaa4e5ee5567158e9e25be4e5",
         "144220896, 2112738462, 2111417781",
         "Computer Science, Computer Science, Mathematics, Computer Science",
         "Computer Science, Mathematics",
         "CV&AI",
         "2014",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science, Mathematics"
        ],
        [
         "1239",
         "258af67f151384df3612a77466f3cb1502dd860a",
         "1403126415, 1734140, 1712460",
         "Computer Science, Computer Science, Computer Science, Mathematics",
         "Computer Science",
         "CV&AI",
         "2014",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science, Mathematics"
        ],
        [
         "1284",
         "6f2f2c895f0141cc770dd72a28c34a51cb772850",
         "2161037, 1780381, 36547165, 5546141, None",
         "Computer Science, Computer Science, Computer Science, Mathematics, Computer Science",
         "Computer Science, Mathematics",
         "CV&AI",
         "2014",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science"
        ],
        [
         "1471",
         "2451f678ef922ca69010ba3096d49ed5ae063f7a",
         "2110092635, 1838178, 2108366320, 2108344510",
         "Computer Science, Computer Science, Computer Science, Computer Science, Engineering",
         "Computer Science",
         "CV&AI",
         "2015",
         "Computer Science, Engineering",
         "Computer Science, Engineering",
         "Computer Science",
         "Computer Science"
        ],
        [
         "1560",
         "9cbbb518b2d607cd3a8081523550d89623490cd6",
         "2109913, 1793182, 2143348956, 144633617",
         "Computer Science, Computer Science, Computer Science, Computer Science, Mathematics",
         "Computer Science",
         "CV&AI",
         "2015",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science"
        ],
        [
         "1662",
         "c227db19df71d9f8ab3cf2fed8fdbd144f31b298",
         "1734018, 21807263, 145820073",
         "Computer Science, Physics, Computer Science, Computer Science",
         "Computer Science",
         "CV&AI",
         "2015",
         "Computer Science, Physics",
         "Computer Science, Physics",
         "Computer Science",
         "Computer Science, Physics"
        ],
        [
         "1786",
         "0848d1cedbb34ee53dc6242bfacdda2ecdc545aa",
         "2462485, 12636684, 1780587",
         "Computer Science, Computer Science, Computer Science, Medicine",
         "Computer Science",
         "CV&AI",
         "2015",
         "Computer Science, Medicine",
         "Computer Science, Medicine",
         "Computer Science",
         "Computer Science, Medicine"
        ],
        [
         "1831",
         "e10a37b06b7534a920b3b68d87412fcfc3a90d6c",
         "2024834167, 2059899475, 144133280",
         "Computer Science, Mathematics, Computer Science, Computer Science",
         "Computer Science",
         "CV&AI",
         "2015",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science, Mathematics"
        ],
        [
         "1896",
         "dacaf094b2c6d67a8e60a572cb935bf2a48336d0",
         "2254178, 2604251, 1780381",
         "Computer Science, Computer Science, Mathematics, Computer Science",
         "Computer Science",
         "CV&AI",
         "2015",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science, Mathematics"
        ],
        [
         "2029",
         "c5529d007bd56dc1e5d67cd7a9d97edacecff4a1",
         "2353495, 2741749, 3336450, 145541445",
         "Computer Science, Computer Science, Computer Science, Computer Science, Mathematics",
         "Computer Science",
         "CV&AI",
         "2015",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science"
        ],
        [
         "2068",
         "c74ab25d3ae4e7242524776aedb46d4191f606df",
         "143892297, 1680933, 1681789",
         "Mathematics, Computer Science, Computer Science, Computer Science",
         "Computer Science",
         "CV&AI",
         "2015",
         "Computer Science, Mathematics",
         "Mathematics, Computer Science",
         "Computer Science",
         "Computer Science, Mathematics"
        ],
        [
         "2168",
         "4b92e24678985feac4225386a9b9f4c76e09195b",
         "143761112, 3110004, 2680543",
         "Computer Science, Computer Science, Computer Science, Geology",
         "Computer Science",
         "CV&AI",
         "2015",
         "Computer Science, Geology",
         "Computer Science, Geology",
         "Computer Science",
         "Computer Science, Geology"
        ],
        [
         "2189",
         "6364fdaa0a0eccd823a779fcdd489173f938e91a",
         "1737326, 152702479, 1710872",
         "Computer Science, Computer Science, Mathematics, Computer Science",
         "Computer Science",
         "CV&AI",
         "2015",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science, Mathematics"
        ],
        [
         "2227",
         "3b466bb66ee79c8e9bcdb6cf9acb54b864dda735",
         "1844358, 48811777, 1904850, 143856428, 145380991",
         "Computer Science, Computer Science, Computer Science, Computer Science, Mathematics, Computer Science",
         "Computer Science",
         "CV&AI",
         "2015",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science"
        ],
        [
         "2312",
         "1f45b6d800108a277f4b2866420eefee3d2581e0",
         "2604251, 1780381, 145950884, 5546141",
         "Computer Science, Mathematics, Computer Science, Computer Science, Computer Science",
         "Computer Science, Mathematics",
         "CV&AI",
         "2015",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science"
        ],
        [
         "2658",
         "4eb082956ea3f9b2d83936c41893e385d8cf8918",
         "34645200, 1626124667, 97829218",
         "Computer Science, Mathematics, Computer Science, Computer Science",
         "Computer Science",
         "CV&AI",
         "2015",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science, Mathematics"
        ],
        [
         "2675",
         "79653d7bd70f23061b8e84cdccaf5b2ca7b75b69",
         "49719535, 143786885, 1682747, 3159766",
         "Computer Science, Computer Science, Mathematics, Computer Science, Computer Science",
         "Computer Science",
         "CV&AI",
         "2015",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science"
        ],
        [
         "3375",
         "dddf11a1865cd66e683af2b84287818773c73908",
         "3300969, 7607499, 2106694700, 1759899, 1697493",
         "Computer Science, Computer Science, Computer Science, Engineering, Computer Science, Computer Science",
         "Computer Science, Engineering",
         "CV&AI",
         "2016",
         "Computer Science, Engineering",
         "Computer Science, Engineering",
         "Computer Science",
         "Computer Science"
        ],
        [
         "3596",
         "92820216a6464de8e654589fcdaaebde044778cb",
         "49061049, 2975978, 1737830, 1969847",
         "Computer Science, Computer Science, Computer Science, Medicine, Computer Science",
         "Computer Science",
         "CV&AI",
         "2016",
         "Computer Science, Medicine",
         "Computer Science, Medicine",
         "Computer Science",
         "Computer Science"
        ],
        [
         "3707",
         "d1053c6414accab4d15841bbdfad0a32e130a3fe",
         "2110974670, 2161037, 1780381, 5546141",
         "Computer Science, Computer Science, Mathematics, Computer Science, Computer Science",
         "Computer Science, Mathematics",
         "CV&AI",
         "2016",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science"
        ],
        [
         "3983",
         "84fc1b09365197299c6a72196022d09f014ae039",
         "1859486, 2108329894, 2096527",
         "Computer Science, Mathematics, Computer Science, Computer Science",
         "Computer Science, Mathematics",
         "CV&AI",
         "2016",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science, Mathematics"
        ],
        [
         "4091",
         "eff5860709f54b8842274962983a7f4fca00a63e",
         "20615377, 34734622, 2099305",
         "Computer Science, Computer Science, Mathematics, Computer Science",
         "Computer Science",
         "CV&AI",
         "2016",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science, Mathematics"
        ],
        [
         "4098",
         "2f13482eca88aec04837b3001507587bb81c8ed4",
         "2521776, 145084658, 2096527, 145952970",
         "Computer Science, Computer Science, Computer Science, Medicine, Computer Science",
         "Medicine, Computer Science",
         "CV&AI",
         "2016",
         "Computer Science, Medicine",
         "Computer Science, Medicine",
         "Computer Science",
         "Computer Science"
        ],
        [
         "4355",
         "6c7f040a150abf21dbcefe1f22e0f98fa184f41a",
         "144828948, 2893664, 3084614, 2876316, 48920094, 1697141",
         "Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Mathematics, Medicine, Computer Science",
         "Computer Science",
         "CV&AI",
         "2016",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics, Medicine",
         "Computer Science",
         "Computer Science, Mathematics"
        ],
        [
         "4467",
         "515b0c116ee6a3f5ec2c3df5f72c5c9a2622b7d7",
         "2794259, 1732855, 1681236",
         "Computer Science, Mathematics, Computer Science, Computer Science",
         "Computer Science, Mathematics",
         "CV&AI",
         "2016",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science, Mathematics"
        ],
        [
         "4490",
         "cf4e1453e0b7293842bbccec9b0b120bcaee746c",
         "3361515, 1718974, 145714168",
         "Computer Science, Computer Science, Computer Science, Mathematics",
         "Mathematics, Computer Science",
         "CV&AI",
         "2016",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science, Mathematics"
        ],
        [
         "4532",
         "443db3d050a5aab65070e59448d1b51548e51eb5",
         "1718974, 2740943, 3361515, 145714168, 145739536",
         "Computer Science, Computer Science, Computer Science, Computer Science, Mathematics, Computer Science",
         "Computer Science",
         "CV&AI",
         "2016",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science"
        ],
        [
         "4626",
         "50004c086ffd6a201a4b782281aaa930fbfe6ecf",
         "1877512, 145587209, 145774206",
         "Biology, Computer Science, Computer Science, Computer Science",
         "Computer Science",
         "CV&AI",
         "2016",
         "Computer Science, Biology",
         "Biology, Computer Science",
         "Computer Science",
         "Computer Science, Biology"
        ],
        [
         "4680",
         "e86f71ca2948d17b003a5f068db1ecb2b77827f7",
         "2698777, 37232298, 5164568, 145791315, 47971768, 30415265",
         "Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Mathematics, Computer Science",
         "Computer Science",
         "CV&AI",
         "2016",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science"
        ],
        [
         "4722",
         "ca228b42de0f0311f4826292a2996d3348ca43fe",
         "48811777, 1844358, 143856428, 145380991",
         "Computer Science, Computer Science, Computer Science, Mathematics, Computer Science",
         "Computer Science",
         "CV&AI",
         "2016",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science"
        ],
        [
         "4746",
         "e02f59cf876cb40233573ff78a1609f969d301cc",
         "2075601, 1735135, 6187400, 2152799887, 32921464",
         "Computer Science, Computer Science, Computer Science, Mathematics, Computer Science, Computer Science",
         "Computer Science",
         "CV&AI",
         "2016",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science"
        ],
        [
         "4758",
         "b6a36b973a1bd61d1541fadfbe09c342f430c231",
         "2709906, 2906220, 3386660, 1781257",
         "Computer Science, Computer Science, Computer Science, Mathematics, Computer Science",
         "Computer Science",
         "CV&AI",
         "2016",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science"
        ],
        [
         "4937",
         "74e0634f9c7ecc9d9a9d319c7741610823e1db2c",
         "143908242, 144574968, 1688428, 144124748",
         "Computer Science, Computer Science, Computer Science, Mathematics, Computer Science",
         "Computer Science",
         "CV&AI",
         "2016",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science"
        ],
        [
         "4996",
         "91c9482801a4c0c2f4f8402188587f0f3b94f78e",
         "3078154, 1823362, 143865718",
         "Computer Science, Computer Science, Mathematics, Computer Science",
         "Computer Science",
         "CV&AI",
         "2016",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science, Mathematics"
        ],
        [
         "5167",
         "5fd235751a9a3e79cfd7599f5e8b4ce7d7baf801",
         "2488938, 48325722, 2358803, 2228323",
         "Computer Science, Computer Science, Mathematics, Computer Science, Computer Science",
         "Computer Science, Mathematics",
         "CV&AI",
         "2016",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science"
        ],
        [
         "5364",
         "9068fd54cdef49c17d5d3342a59d52fa964e30ee",
         "1403156722, 1848930, 1681236",
         "Computer Science, Computer Science, Mathematics, Computer Science",
         "Computer Science",
         "CV&AI",
         "2016",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science, Mathematics"
        ],
        [
         "5365",
         "32ecbbd76fdce249f9109594eee2d52a1cafdfc7",
         "3468964, 143819050, 1687690, 143740671",
         "Computer Science, Computer Science, Computer Science, Mathematics, Computer Science",
         "Computer Science",
         "CV&AI",
         "2016",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science"
        ],
        [
         "5385",
         "33903ee585965fde86f665d03ff4b945a110194e",
         "3341267, 73482045, 1692756",
         "Computer Science, Computer Science, Computer Science, Mathematics",
         "Computer Science",
         "CV&AI",
         "2016",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science, Mathematics"
        ],
        [
         "5472",
         "72273978a502d9b017f732126943dc995b67c0fe",
         "1740425, 3468723, 48336229, 27379268, 14489533",
         "Computer Science, Computer Science, Computer Science, Computer Science, Mathematics, Computer Science",
         "Computer Science",
         "CV&AI",
         "2016",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science"
        ],
        [
         "5511",
         "3825562f65d5ac2a3e76890f0236fae4c64952e9",
         "3451934, 2965024, 71458732",
         "Computer Science, Computer Science, Biology, Computer Science",
         "Computer Science",
         "CV&AI",
         "2016",
         "Computer Science, Biology",
         "Computer Science, Biology",
         "Computer Science",
         "Computer Science, Biology"
        ],
        [
         "5635",
         "a4bfc7436272926d4e433b4e425b2d5aa61227dc",
         "2572430, 1780381, 1724393",
         "Medicine, Computer Science, Computer Science, Computer Science",
         "Medicine, Computer Science",
         "CV&AI",
         "2016",
         "Computer Science, Medicine",
         "Medicine, Computer Science",
         "Computer Science",
         "Computer Science, Medicine"
        ],
        [
         "5656",
         "cad4ac0d2389a89cf1955dd4788278c1e8ac1af9",
         "144828948, 2893664, 98123372, 7163113, 48920094, 1697141",
         "Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Mathematics, Medicine, Computer Science",
         "Computer Science",
         "CV&AI",
         "2016",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics, Medicine",
         "Computer Science",
         "Computer Science, Mathematics"
        ],
        [
         "5840",
         "4c15816f798ffe10553188e15a62fa89711a5f72",
         "3198263, 48873988, 37144787",
         "Computer Science, Computer Science, Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "CV&AI",
         "2016",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science, Mathematics"
        ],
        [
         "5882",
         "8b00f15947810ed1fa23783b7ea1b84c16b0d327",
         "38769455, 1718767, 3337121",
         "Computer Science, Computer Science, Mathematics, Computer Science",
         "Computer Science",
         "CV&AI",
         "2016",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science, Mathematics"
        ],
        [
         "5935",
         "f10106c3c111a90e340e61ca1c2caaa6241d56f1",
         "48873988, 1926727, 2163097, 2118214154",
         "Computer Science, Computer Science, Mathematics, Computer Science, Computer Science",
         "Computer Science",
         "CV&AI",
         "2016",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science"
        ],
        [
         "6188",
         "41d08fb733f3e50ac183490f84d6377dffccf350",
         "1934546, 144914140, 51352814",
         "Mathematics, Computer Science, Computer Science, Computer Science",
         "Mathematics, Computer Science",
         "CV&AI",
         "2016",
         "Computer Science, Mathematics",
         "Mathematics, Computer Science",
         "Computer Science",
         "Computer Science, Mathematics"
        ],
        [
         "6425",
         "18b125a47bc80c9e0e7c17a0899842d89a0614b1",
         "2241528, 2060385740, 2483916, 1697141, 40030651",
         "Computer Science, Computer Science, Computer Science, Computer Science, Computer Science, Mathematics",
         "Computer Science",
         "CV&AI",
         "2017",
         "Computer Science, Mathematics",
         "Computer Science, Mathematics",
         "Computer Science",
         "Computer Science"
        ]
       ],
       "shape": {
        "columns": 10,
        "rows": 634
       }
      },
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>paper_id</th>\n",
       "      <th>author_ids</th>\n",
       "      <th>fieldsUntilYear</th>\n",
       "      <th>authors_final_FOS</th>\n",
       "      <th>subdomain</th>\n",
       "      <th>year</th>\n",
       "      <th>main_FOS</th>\n",
       "      <th>aggregated_fields</th>\n",
       "      <th>threshold_75</th>\n",
       "      <th>threshold_80</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>198</th>\n",
       "      <td>1eb3feb77ea540f2551920843258b8de70c9b9dd</td>\n",
       "      <td>2161037, 36547165, 1780381</td>\n",
       "      <td>Computer Science, Computer Science, Mathematic...</td>\n",
       "      <td>Computer Science, Medicine, Mathematics</td>\n",
       "      <td>CV&amp;AI</td>\n",
       "      <td>2014</td>\n",
       "      <td>Computer Science, Mathematics</td>\n",
       "      <td>Computer Science, Mathematics</td>\n",
       "      <td>Computer Science</td>\n",
       "      <td>Computer Science, Mathematics</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>310</th>\n",
       "      <td>198f325aaadd3bfb4877b1fbbe7416bde1eb4b62</td>\n",
       "      <td>1786819, 1877377, 2133884, 2734293, 1992077</td>\n",
       "      <td>Computer Science, Mathematics, Computer Scienc...</td>\n",
       "      <td>Computer Science</td>\n",
       "      <td>CV&amp;AI</td>\n",
       "      <td>2014</td>\n",
       "      <td>Computer Science, Mathematics</td>\n",
       "      <td>Computer Science, Mathematics</td>\n",
       "      <td>Computer Science</td>\n",
       "      <td>Computer Science</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>669</th>\n",
       "      <td>8e07ece8543f9431f03b66c0b296cedb1435e667</td>\n",
       "      <td>1745524, 2133680, 1753210</td>\n",
       "      <td>Computer Science, Computer Science, Mathematic...</td>\n",
       "      <td>Computer Science</td>\n",
       "      <td>CV&amp;AI</td>\n",
       "      <td>2014</td>\n",
       "      <td>Computer Science, Mathematics</td>\n",
       "      <td>Computer Science, Mathematics</td>\n",
       "      <td>Computer Science</td>\n",
       "      <td>Computer Science, Mathematics</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>715</th>\n",
       "      <td>1cb2e7daf2a1577aaa4e5ee5567158e9e25be4e5</td>\n",
       "      <td>144220896, 2112738462, 2111417781</td>\n",
       "      <td>Computer Science, Computer Science, Mathematic...</td>\n",
       "      <td>Computer Science, Mathematics</td>\n",
       "      <td>CV&amp;AI</td>\n",
       "      <td>2014</td>\n",
       "      <td>Computer Science, Mathematics</td>\n",
       "      <td>Computer Science, Mathematics</td>\n",
       "      <td>Computer Science</td>\n",
       "      <td>Computer Science, Mathematics</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1239</th>\n",
       "      <td>258af67f151384df3612a77466f3cb1502dd860a</td>\n",
       "      <td>1403126415, 1734140, 1712460</td>\n",
       "      <td>Computer Science, Computer Science, Computer S...</td>\n",
       "      <td>Computer Science</td>\n",
       "      <td>CV&amp;AI</td>\n",
       "      <td>2014</td>\n",
       "      <td>Computer Science, Mathematics</td>\n",
       "      <td>Computer Science, Mathematics</td>\n",
       "      <td>Computer Science</td>\n",
       "      <td>Computer Science, Mathematics</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>69633</th>\n",
       "      <td>9e47e2395f3570c279f771fd23c461685b95a235</td>\n",
       "      <td>2210201247, 2116023008, 2115409042, 2249716237...</td>\n",
       "      <td>Computer Science, Computer Science, Computer S...</td>\n",
       "      <td>Computer Science</td>\n",
       "      <td>CV&amp;AI</td>\n",
       "      <td>2024</td>\n",
       "      <td>Computer Science, Medicine</td>\n",
       "      <td>Computer Science, Medicine</td>\n",
       "      <td>Computer Science</td>\n",
       "      <td>Computer Science</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>69670</th>\n",
       "      <td>67b0c97fc41444ff4ec31cb5cbecfee1cce4708f</td>\n",
       "      <td>2292136297, 9372087, 2261908445, 2261059515</td>\n",
       "      <td>Computer Science, Engineering, Computer Scienc...</td>\n",
       "      <td>Computer Science</td>\n",
       "      <td>CV&amp;AI</td>\n",
       "      <td>2024</td>\n",
       "      <td>Computer Science, Engineering</td>\n",
       "      <td>Computer Science, Engineering</td>\n",
       "      <td>Computer Science</td>\n",
       "      <td>Computer Science</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70222</th>\n",
       "      <td>48c34b348205ac1d3bbc5e84e7d2103b348b6c9a</td>\n",
       "      <td>2203913675, 2282974473, 2303467071, 2238243162...</td>\n",
       "      <td>Computer Science, Computer Science, Computer S...</td>\n",
       "      <td>Computer Science</td>\n",
       "      <td>CV&amp;AI</td>\n",
       "      <td>2024</td>\n",
       "      <td>Computer Science, Medicine</td>\n",
       "      <td>Computer Science, Medicine</td>\n",
       "      <td>Computer Science</td>\n",
       "      <td>Computer Science</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>71307</th>\n",
       "      <td>4c41c3da7b9615bd1c47980277c360c1cd4f52de</td>\n",
       "      <td>2296988727, 2268628460, 2155857668, 50468629, ...</td>\n",
       "      <td>Computer Science, Computer Science, Computer S...</td>\n",
       "      <td>Computer Science</td>\n",
       "      <td>CV&amp;AI</td>\n",
       "      <td>2024</td>\n",
       "      <td>Computer Science, Engineering</td>\n",
       "      <td>Computer Science, Engineering</td>\n",
       "      <td>Computer Science</td>\n",
       "      <td>Computer Science</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>72253</th>\n",
       "      <td>fea07ef0059de06898197dc4ca4fabb288e4b987</td>\n",
       "      <td>2277653036, 2277692304, 2283139840, 2257114672...</td>\n",
       "      <td>Computer Science, Computer Science, Computer S...</td>\n",
       "      <td>Computer Science</td>\n",
       "      <td>CV&amp;AI</td>\n",
       "      <td>2024</td>\n",
       "      <td>Computer Science, Medicine</td>\n",
       "      <td>Computer Science, Medicine</td>\n",
       "      <td>Computer Science</td>\n",
       "      <td>Computer Science</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>634 rows × 10 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       paper_id  \\\n",
       "198    1eb3feb77ea540f2551920843258b8de70c9b9dd   \n",
       "310    198f325aaadd3bfb4877b1fbbe7416bde1eb4b62   \n",
       "669    8e07ece8543f9431f03b66c0b296cedb1435e667   \n",
       "715    1cb2e7daf2a1577aaa4e5ee5567158e9e25be4e5   \n",
       "1239   258af67f151384df3612a77466f3cb1502dd860a   \n",
       "...                                         ...   \n",
       "69633  9e47e2395f3570c279f771fd23c461685b95a235   \n",
       "69670  67b0c97fc41444ff4ec31cb5cbecfee1cce4708f   \n",
       "70222  48c34b348205ac1d3bbc5e84e7d2103b348b6c9a   \n",
       "71307  4c41c3da7b9615bd1c47980277c360c1cd4f52de   \n",
       "72253  fea07ef0059de06898197dc4ca4fabb288e4b987   \n",
       "\n",
       "                                              author_ids  \\\n",
       "198                           2161037, 36547165, 1780381   \n",
       "310          1786819, 1877377, 2133884, 2734293, 1992077   \n",
       "669                            1745524, 2133680, 1753210   \n",
       "715                    144220896, 2112738462, 2111417781   \n",
       "1239                        1403126415, 1734140, 1712460   \n",
       "...                                                  ...   \n",
       "69633  2210201247, 2116023008, 2115409042, 2249716237...   \n",
       "69670        2292136297, 9372087, 2261908445, 2261059515   \n",
       "70222  2203913675, 2282974473, 2303467071, 2238243162...   \n",
       "71307  2296988727, 2268628460, 2155857668, 50468629, ...   \n",
       "72253  2277653036, 2277692304, 2283139840, 2257114672...   \n",
       "\n",
       "                                         fieldsUntilYear  \\\n",
       "198    Computer Science, Computer Science, Mathematic...   \n",
       "310    Computer Science, Mathematics, Computer Scienc...   \n",
       "669    Computer Science, Computer Science, Mathematic...   \n",
       "715    Computer Science, Computer Science, Mathematic...   \n",
       "1239   Computer Science, Computer Science, Computer S...   \n",
       "...                                                  ...   \n",
       "69633  Computer Science, Computer Science, Computer S...   \n",
       "69670  Computer Science, Engineering, Computer Scienc...   \n",
       "70222  Computer Science, Computer Science, Computer S...   \n",
       "71307  Computer Science, Computer Science, Computer S...   \n",
       "72253  Computer Science, Computer Science, Computer S...   \n",
       "\n",
       "                             authors_final_FOS subdomain  year  \\\n",
       "198    Computer Science, Medicine, Mathematics     CV&AI  2014   \n",
       "310                           Computer Science     CV&AI  2014   \n",
       "669                           Computer Science     CV&AI  2014   \n",
       "715              Computer Science, Mathematics     CV&AI  2014   \n",
       "1239                          Computer Science     CV&AI  2014   \n",
       "...                                        ...       ...   ...   \n",
       "69633                         Computer Science     CV&AI  2024   \n",
       "69670                         Computer Science     CV&AI  2024   \n",
       "70222                         Computer Science     CV&AI  2024   \n",
       "71307                         Computer Science     CV&AI  2024   \n",
       "72253                         Computer Science     CV&AI  2024   \n",
       "\n",
       "                            main_FOS              aggregated_fields  \\\n",
       "198    Computer Science, Mathematics  Computer Science, Mathematics   \n",
       "310    Computer Science, Mathematics  Computer Science, Mathematics   \n",
       "669    Computer Science, Mathematics  Computer Science, Mathematics   \n",
       "715    Computer Science, Mathematics  Computer Science, Mathematics   \n",
       "1239   Computer Science, Mathematics  Computer Science, Mathematics   \n",
       "...                              ...                            ...   \n",
       "69633     Computer Science, Medicine     Computer Science, Medicine   \n",
       "69670  Computer Science, Engineering  Computer Science, Engineering   \n",
       "70222     Computer Science, Medicine     Computer Science, Medicine   \n",
       "71307  Computer Science, Engineering  Computer Science, Engineering   \n",
       "72253     Computer Science, Medicine     Computer Science, Medicine   \n",
       "\n",
       "           threshold_75                   threshold_80  \n",
       "198    Computer Science  Computer Science, Mathematics  \n",
       "310    Computer Science               Computer Science  \n",
       "669    Computer Science  Computer Science, Mathematics  \n",
       "715    Computer Science  Computer Science, Mathematics  \n",
       "1239   Computer Science  Computer Science, Mathematics  \n",
       "...                 ...                            ...  \n",
       "69633  Computer Science               Computer Science  \n",
       "69670  Computer Science               Computer Science  \n",
       "70222  Computer Science               Computer Science  \n",
       "71307  Computer Science               Computer Science  \n",
       "72253  Computer Science               Computer Science  \n",
       "\n",
       "[634 rows x 10 columns]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "different_from_75"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "d3ac3760",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['nlp_fos_authors_threshold_75.csv', 'cv_fos_authors_threshold_80.csv', 'nlp_2024_fos_authors_threshold_75.csv', 'cy_fos_authors_threshold_75.csv', 'cv_fos_authors_threshold_75.csv', 'cy_fos_authors_threshold_80.csv', 'nlp_2024_fos_authors_threshold_80.csv', 'nlp_fos_authors_threshold_80.csv']\n"
     ]
    }
   ],
   "source": [
    "# Directory holding the field-of-study threshold robustness CSVs. This cell only\n",
    "# lists what is already there; the following cell writes the threshold_75 /\n",
    "# threshold_80 exports into the same directory.\n",
    "\n",
    "THRESH_DIR = '/Users/dromar/Documents/MyDrive/Research/ACL/code/Robustness tests/FOS/robust_test_thresholds_fos'\n",
    "# os.listdir returns entries in arbitrary order; sort so the listing is stable across runs.\n",
    "print(sorted(os.listdir(THRESH_DIR)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "6e72bbe5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write one CSV per threshold column into THRESH_DIR. Each file keeps only\n",
    "# paper_id, year and the corresponding threshold column from merged_df.\n",
    "filename75 = \"cv_fos_authors_75_2025.csv\"\n",
    "filename80 = \"cv_fos_authors_80_2025.csv\"\n",
    "\n",
    "for threshold_col, out_name in (('threshold_75', filename75), ('threshold_80', filename80)):\n",
    "    out_path = os.path.join(THRESH_DIR, out_name)\n",
    "    merged_df[['paper_id', 'year', threshold_col]].to_csv(out_path, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f1dcdb4",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "societal_influence_env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.21"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
